Look at the sampling effort for the various WQ parameters to be included in the long-term WQ publication. Here are some notes and issues to consider:
Notable differences between raw_chla_1975_2021 (from
DroughtData) and the data used by the Primary Producers
group:
raw_chla_1975_2021 includes the four EZ stations from
EMPraw_chla_1975_2021 has data for November 2021raw_chla_1975_2021 has additional Chlorophyll_Sign
column that indicates whether the chlorophyll-a value is above or below
the RL. For the 3 values that are below the RL,
raw_chla_1975_2021 reports them as equal to the RL, while
the data from the Primary Producers group reports them as half the
RL.Things to consider with the nutrient, chlorophyll-a, and WQ data sets:
Long_term == FALSE, while those records are removed from
the nutrient and WQ data sets. We may want to make these all
consistent.So far, these are the surveys included in this summary:
# Load packages
library(tidyverse)
library(lubridate)
library(hms)
library(scales)
library(DroughtData)
library(discretewq)
library(deltamapr)
library(sf)
library(dtplyr)
# Run session info to display package versions
devtools::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.2.1 (2022-06-23 ucrt)
## os Windows 10 x64 (build 19044)
## system x86_64, mingw32
## ui RTerm
## language (EN)
## collate English_United States.utf8
## ctype English_United States.utf8
## tz America/Los_Angeles
## date 2022-08-10
## pandoc 2.18 @ C:/Program Files/RStudio/bin/quarto/bin/tools/ (via rmarkdown)
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.2.1)
## backports 1.4.1 2021-12-13 [1] CRAN (R 4.2.0)
## broom 1.0.0 2022-07-01 [1] CRAN (R 4.2.1)
## bslib 0.4.0 2022-07-16 [1] CRAN (R 4.2.1)
## cachem 1.0.6 2021-08-19 [1] CRAN (R 4.2.1)
## callr 3.7.1 2022-07-13 [1] CRAN (R 4.2.1)
## cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.2.1)
## class 7.3-20 2022-01-16 [2] CRAN (R 4.2.1)
## classInt 0.4-7 2022-06-10 [1] CRAN (R 4.2.1)
## cli 3.3.0 2022-04-25 [1] CRAN (R 4.2.1)
## colorspace 2.0-3 2022-02-21 [1] CRAN (R 4.2.1)
## crayon 1.5.1 2022-03-26 [1] CRAN (R 4.2.1)
## data.table 1.14.2 2021-09-27 [1] CRAN (R 4.2.1)
## DBI 1.1.3 2022-06-18 [1] CRAN (R 4.2.1)
## dbplyr 2.2.1 2022-06-27 [1] CRAN (R 4.2.1)
## deltamapr * 1.0.0 2021-06-18 [1] Github (InteragencyEcologicalProgram/deltamapr@d0a6f9c)
## devtools 2.4.4 2022-07-20 [1] CRAN (R 4.2.1)
## digest 0.6.29 2021-12-01 [1] CRAN (R 4.2.1)
## discretewq * 2.3.2 2022-08-03 [1] Github (sbashevkin/discretewq@99b3ce8)
## dplyr * 1.0.9 2022-04-28 [1] CRAN (R 4.2.1)
## DroughtData * 1.1.0.9000 2022-06-21 [1] local
## dtplyr * 1.2.1 2022-01-19 [1] CRAN (R 4.2.1)
## e1071 1.7-11 2022-06-07 [1] CRAN (R 4.2.1)
## ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.2.1)
## evaluate 0.15 2022-02-18 [1] CRAN (R 4.2.1)
## fansi 1.0.3 2022-03-24 [1] CRAN (R 4.2.1)
## fastmap 1.1.0 2021-01-25 [1] CRAN (R 4.2.1)
## forcats * 0.5.1 2021-01-27 [1] CRAN (R 4.2.1)
## fs 1.5.2 2021-12-08 [1] CRAN (R 4.2.1)
## gargle 1.2.0 2021-07-02 [1] CRAN (R 4.2.1)
## generics 0.1.3 2022-07-05 [1] CRAN (R 4.2.1)
## ggplot2 * 3.3.6 2022-05-03 [1] CRAN (R 4.2.1)
## glue 1.6.2 2022-02-24 [1] CRAN (R 4.2.1)
## googledrive 2.0.0 2021-07-08 [1] CRAN (R 4.2.1)
## googlesheets4 1.0.0 2021-07-21 [1] CRAN (R 4.2.1)
## gtable 0.3.0 2019-03-25 [1] CRAN (R 4.2.1)
## haven 2.5.0 2022-04-15 [1] CRAN (R 4.2.1)
## hms * 1.1.1 2021-09-26 [1] CRAN (R 4.2.1)
## htmltools 0.5.3 2022-07-18 [1] CRAN (R 4.2.1)
## htmlwidgets 1.5.4 2021-09-08 [1] CRAN (R 4.2.1)
## httpuv 1.6.5 2022-01-05 [1] CRAN (R 4.2.1)
## httr 1.4.3 2022-05-04 [1] CRAN (R 4.2.1)
## jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.2.1)
## jsonlite 1.8.0 2022-02-22 [1] CRAN (R 4.2.1)
## KernSmooth 2.23-20 2021-05-03 [2] CRAN (R 4.2.1)
## knitr 1.39 2022-04-26 [1] CRAN (R 4.2.1)
## later 1.3.0 2021-08-18 [1] CRAN (R 4.2.1)
## lifecycle 1.0.1 2021-09-24 [1] CRAN (R 4.2.1)
## lubridate * 1.8.0 2021-10-07 [1] CRAN (R 4.2.1)
## magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.2.1)
## memoise 2.0.1 2021-11-26 [1] CRAN (R 4.2.1)
## mime 0.12 2021-09-28 [1] CRAN (R 4.2.0)
## miniUI 0.1.1.1 2018-05-18 [1] CRAN (R 4.2.1)
## modelr 0.1.8 2020-05-19 [1] CRAN (R 4.2.1)
## munsell 0.5.0 2018-06-12 [1] CRAN (R 4.2.1)
## pillar 1.8.0 2022-07-18 [1] CRAN (R 4.2.1)
## pkgbuild 1.3.1 2021-12-20 [1] CRAN (R 4.2.1)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.2.1)
## pkgload 1.3.0 2022-06-27 [1] CRAN (R 4.2.1)
## prettyunits 1.1.1 2020-01-24 [1] CRAN (R 4.2.1)
## processx 3.7.0 2022-07-07 [1] CRAN (R 4.2.1)
## profvis 0.3.7 2020-11-02 [1] CRAN (R 4.2.1)
## promises 1.2.0.1 2021-02-11 [1] CRAN (R 4.2.1)
## proxy 0.4-27 2022-06-09 [1] CRAN (R 4.2.1)
## ps 1.7.1 2022-06-18 [1] CRAN (R 4.2.1)
## purrr * 0.3.4 2020-04-17 [1] CRAN (R 4.2.1)
## R6 2.5.1 2021-08-19 [1] CRAN (R 4.2.1)
## Rcpp 1.0.9 2022-07-08 [1] CRAN (R 4.2.1)
## readr * 2.1.2 2022-01-30 [1] CRAN (R 4.2.1)
## readxl 1.4.0 2022-03-28 [1] CRAN (R 4.2.1)
## remotes 2.4.2 2021-11-30 [1] CRAN (R 4.2.1)
## reprex 2.0.1 2021-08-05 [1] CRAN (R 4.2.1)
## rlang 1.0.4 2022-07-12 [1] CRAN (R 4.2.1)
## rmarkdown 2.14 2022-04-25 [1] CRAN (R 4.2.1)
## rstudioapi 0.13 2020-11-12 [1] CRAN (R 4.2.1)
## rvest 1.0.2 2021-10-16 [1] CRAN (R 4.2.1)
## sass 0.4.2 2022-07-16 [1] CRAN (R 4.2.1)
## scales * 1.2.0 2022-04-13 [1] CRAN (R 4.2.1)
## sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.2.1)
## sf * 1.0-8 2022-07-14 [1] CRAN (R 4.2.1)
## shiny 1.7.2 2022-07-19 [1] CRAN (R 4.2.1)
## stringi 1.7.8 2022-07-11 [1] CRAN (R 4.2.1)
## stringr * 1.4.0 2019-02-10 [1] CRAN (R 4.2.1)
## tibble * 3.1.8 2022-07-22 [1] CRAN (R 4.2.1)
## tidyr * 1.2.0 2022-02-01 [1] CRAN (R 4.2.1)
## tidyselect 1.1.2 2022-02-21 [1] CRAN (R 4.2.1)
## tidyverse * 1.3.2 2022-07-18 [1] CRAN (R 4.2.1)
## tzdb 0.3.0 2022-03-28 [1] CRAN (R 4.2.1)
## units 0.8-0 2022-02-05 [1] CRAN (R 4.2.1)
## urlchecker 1.0.1 2021-11-30 [1] CRAN (R 4.2.1)
## usethis 2.1.6 2022-05-25 [1] CRAN (R 4.2.1)
## utf8 1.2.2 2021-07-24 [1] CRAN (R 4.2.1)
## vctrs 0.4.1 2022-04-13 [1] CRAN (R 4.2.1)
## withr 2.5.0 2022-03-03 [1] CRAN (R 4.2.1)
## xfun 0.31 2022-05-10 [1] CRAN (R 4.2.1)
## xml2 1.3.3 2021-11-30 [1] CRAN (R 4.2.1)
## xtable 1.8-4 2019-04-21 [1] CRAN (R 4.2.1)
## yaml 2.3.5 2022-02-21 [1] CRAN (R 4.2.1)
##
## [1] C:/R/win-library/4.2
## [2] C:/Program Files/R/R-4.2.1/library
##
## ──────────────────────────────────────────────────────────────────────────────
# Define factor level for Region
region_lev <- c(
"North",
"SouthCentral",
"Confluence",
"Suisun Bay",
"Suisun Marsh"
)
# Create a nested data frame and prepare to run the summary plot function on - data in DroughtData
ndf_lt_wq <-
tibble(
Parameter = c(
"Temperature",
"Salinity",
"Secchi",
"DissAmmonia",
"DissNitrateNitrite",
"DissOrthophos",
"Chlorophyll"
),
df_data = c(
rep(list(raw_wq_1975_2021), 3),
rep(list(raw_nutr_1975_2021), 3),
list(raw_chla_1975_2021)
)
) %>%
mutate(
df_data_c = map2(
df_data,
Parameter,
~ drop_na(.x, all_of(.y)) %>%
mutate(
Month = fct_rev(month(Date, label = TRUE)),
Region = factor(Region, levels = region_lev)
)
)
)
# Pull in and prepare the WQ data from discretewq
df_dwq <-
wq(
Sources = c(
"EMP",
"STN",
"FMWT",
"EDSM",
"DJFMP",
"SDO",
"SKT",
"SLS",
"20mm",
"Suisun",
"Baystudy",
"USBR",
"USGS_SFBS",
"YBFMP",
"USGS_CAWSC"
)
) %>%
transmute(
Source,
Station,
Latitude,
Longitude,
Date = date(Date),
# Convert Datetime to PST
Datetime = with_tz(Datetime, tzone = "Etc/GMT+8"),
Temperature,
Salinity,
Secchi,
Chlorophyll,
DissAmmonia,
DissNitrateNitrite,
DissOrthophos
) %>%
filter(
!if_all(
c(Temperature, Salinity, Secchi, Chlorophyll, DissAmmonia, DissNitrateNitrite, DissOrthophos),
is.na
)
)
DroughtData# Create function for sampling effort plots
plot_samp_effort <- function(df, grp_var, yr_var) {
df %>%
count(.data[[yr_var]], Month, .data[[grp_var]], name = "num_samples") %>%
ggplot(aes(x = .data[[yr_var]], y = Month, fill = num_samples)) +
geom_tile() +
facet_grid(rows = vars(.data[[grp_var]]), drop = FALSE) +
scale_x_continuous(breaks = breaks_pretty(20), expand = expansion()) +
scale_y_discrete(drop = FALSE) +
scale_fill_viridis_c(name = "Number of Samples") +
theme_bw() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
legend.position = "top"
)
}
# Create plots for each Parameter
ndf_lt_wq_plt <- ndf_lt_wq %>%
mutate(plt = map(df_data_c, .f = plot_samp_effort, grp_var = "Region", yr_var = "YearAdj"))
The secchi depth data has a large hole from Jan-Aug in 2017-2021 in the North region, which seems suspicious. Let’s look at the sampling effort for secchi for each station in the North region.
ndf_lt_wq$df_data_c[[3]] %>%
filter(Region == "North") %>%
plot_samp_effort(grp_var = "Station", yr_var = "YearAdj")
Hmmm, the problem seems to be coming from the one current EMP station
(C3A) in the North region. Let’s look at the data directly from the
discretewq package (version 2.3.2) to see if other current
EMP stations have WQ data during the same time period.
# Import EMP station metadata from most recent EDI publication to use for status designation
df_emp_meta <- read_csv("https://portal.edirepository.org/nis/dataviewer?packageid=edi.458.6&entityid=ecf241d54a8335a49f8dfc8813d75609")
# Create vector of stations with active status
emp_sta_active <- df_emp_meta %>%
filter(Status == "Active") %>%
pull(Station)
# Pull out water quality data from EMP dataset within discretewq
emp_wq <- discretewq::EMP %>%
transmute(
Station,
Date = date(Date),
Year = year(Date),
Month = fct_rev(month(Date, label = TRUE)),
Conductivity,
Temperature,
Secchi
) %>%
# Only include active stations
filter(Station %in% emp_sta_active)
# Create a nested data frame and run the summary plot function on each WQ parameter
ndf_emp_wq <-
tibble(
Parameter = c(
"Conductivity",
"Temperature",
"Secchi"
),
df_data = rep(list(emp_wq), 3)
) %>%
mutate(
df_data_c = map2(df_data, Parameter, ~ drop_na(.x, all_of(.y))),
plt = map(df_data_c, .f = plot_samp_effort, grp_var = "Station", yr_var = "Year")
)
It looks like only the shore-based stations (C3A, C10A, C9) have the issue of missing secchi depth data in recent years. Sarah Perry confirmed that EMP doesn’t collect secchi depth data at their shore-based stations anymore, so that explains what’s going on here.
discretewqNow that we’ve been making updates to the WQ data in the discretewq
package, let’s take a look at which surveys we can use for the
long-term WQ publication. First, we’ll look at the temporal scale of all
of the surveys available.
df_dwq %>%
group_by(Source) %>%
summarize(min_date = min(Date), max_date = max(Date)) %>%
arrange(min_date)
## # A tibble: 15 × 3
## Source min_date max_date
## <chr> <date> <date>
## 1 FMWT 1967-09-12 2021-12-16
## 2 USGS_SFBS 1969-04-10 2022-05-09
## 3 STN 1969-07-05 2021-08-19
## 4 USGS_CAWSC 1971-03-02 2021-03-10
## 5 EMP 1975-01-07 2021-12-16
## 6 DJFMP 1976-05-13 2021-12-29
## 7 Suisun 1979-05-16 2021-08-25
## 8 Baystudy 1980-01-23 2020-12-01
## 9 20mm 1995-04-24 2021-07-16
## 10 SDO 1997-08-04 2018-11-08
## 11 YBFMP 1998-01-19 2021-12-30
## 12 SKT 2002-01-07 2021-04-29
## 13 SLS 2009-01-05 2021-03-17
## 14 USBR 2012-05-08 2019-10-22
## 15 EDSM 2016-12-15 2021-11-26
Overall, for all parameters, it looks like FMWT, USGS_SFBS, STN, USGS_CAWSC, EMP, DJFMP, Suisun, and Baystudy have adequate temporal coverage for the long-term analysis. Now let’s take a look at the spatial coverage.
# Only include surveys with adequate temporal coverage
df_dwq_lt <- df_dwq %>%
filter(Source %in% c("FMWT", "USGS_SFBS", "STN", "USGS_CAWSC", "EMP", "DJFMP", "Suisun", "Baystudy"))
# Pull out station coordinates and convert to sf object
sf_dwq_lt <- df_dwq_lt %>%
distinct(Source, Station, Latitude, Longitude) %>%
drop_na(Latitude, Longitude) %>%
st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326) %>%
st_transform(st_crs(R_EDSM_Subregions_Mahardja_FLOAT))
# Load Delta Shapefile from Brian and only keep SubRegions east of Carquinez Straight
sf_delta <- R_EDSM_Subregions_Mahardja_FLOAT %>%
filter(
!SubRegion %in% c(
"Carquinez Strait",
"Lower Napa River",
"San Francisco Bay",
"San Pablo Bay",
"South Bay",
"Upper Napa River"
)
) %>%
select(SubRegion)
# Plot all stations over the SubRegions
ggplot() +
geom_sf(data = sf_delta, fill = "green", alpha = 0.5) +
geom_sf(data = sf_dwq_lt)
# Assign SubRegions to the stations
sf_dwq_lt_reg <- sf_dwq_lt %>%
st_join(sf_delta, join = st_intersects) %>%
filter(!is.na(SubRegion))
# Remove SubRegions without stations from sf_delta
sf_delta_c <- sf_delta %>% filter(SubRegion %in% unique(sf_dwq_lt_reg$SubRegion))
# Plot all stations over the SubRegions again - with limited ranges
ggplot() +
geom_sf(data = sf_delta_c, fill = "blue", alpha = 0.5) +
geom_sf(data = sf_dwq_lt_reg)
All but one of the SubRegions east of Carquinez Straight has at least one station from a long term survey within it. Now let’s take a closer look at the temporal data coverage for each Station and parameter.
# Prepare long-term data from discretewq
df_dwq_lt_c <- df_dwq_lt %>%
# Remove records without latitude-longitude coordinates
drop_na(Latitude, Longitude) %>%
# Convert to sf object
st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326, remove = FALSE) %>%
# Change to crs of sf_delta_c
st_transform(crs = st_crs(sf_delta_c)) %>%
# Add subregions
st_join(sf_delta_c, join = st_intersects) %>%
# Remove any data outside our subregions of interest
filter(!is.na(SubRegion)) %>%
# Drop sf geometry column since it's no longer needed
st_drop_geometry() %>%
# Add variables for adjusted calendar year, month, and season
# Adjusted calendar year: December-November, with December of the previous calendar year
# included with the following year
mutate(
Month = month(Date),
YearAdj = if_else(Month == 12, year(Date) + 1, year(Date)),
Season = case_when(
Month %in% 3:5 ~ "Spring",
Month %in% 6:8 ~ "Summer",
Month %in% 9:11 ~ "Fall",
Month %in% c(12, 1, 2) ~ "Winter"
)
) %>%
# Restrict data to 1975-2021
filter(YearAdj %in% 1975:2021)
plot_samp_effort_sta <- function(df, param) {
df_param <- df %>% drop_na(.data[[param]])
# Look for any instances when more than 1 data point was collected at a station-day
df_dups <- df_param %>%
count(Station, Date) %>%
filter(n > 1) %>%
select(-n)
# Fix duplicates if there are any
if (length(df_dups > 0)) {
df_dups_fixed <- df_param %>%
inner_join(df_dups, by = c("Station", "Date")) %>%
drop_na(Datetime) %>%
mutate(
# Create variable for time
Time = as_hms(Datetime),
# Calculate difference from noon for each data point for later filtering
Noon_diff = abs(hms(hours = 12) - Time)
) %>%
# Use dtplyr to speed up operations
lazy_dt() %>%
group_by(Station, Date) %>%
# Select only 1 data point per station and date, choose data closest to noon
filter(Noon_diff == min(Noon_diff)) %>%
# When points are equidistant from noon, select earlier point
filter(Time == min(Time)) %>%
ungroup() %>%
# End dtplyr operation
as_tibble() %>%
select(-c(Time, Noon_diff))
df_param <- df_param %>%
anti_join(df_dups, by = c("Station", "Date")) %>%
bind_rows(df_dups_fixed)
}
# Create summary heat map of sampling effort for each station
df_param %>%
count(Station, YearAdj, name = "num_samples") %>%
ggplot(aes(x = YearAdj, y = Station, fill = num_samples)) +
geom_tile() +
scale_x_continuous(breaks = breaks_pretty(20), expand = expansion()) +
scale_fill_viridis_c(name = "Number of Samples") +
theme_bw() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
legend.position = "top"
)
}
# Nest long-term data from discretewq by Source
ndf_dwq_lt_c <- df_dwq_lt_c %>% nest(df_data = -Source)
# Create a nested data frame to run the summary plot function on
ndf_dwq_lt_c <-
tibble(
Parameter = c(
"Temperature",
"Salinity",
"Secchi",
"DissAmmonia",
"DissNitrateNitrite",
"DissOrthophos",
"Chlorophyll"
),
df_data = rep(list(ndf_dwq_lt_c), 7)
) %>%
unnest(df_data)
# Create plots for each Parameter and Source
ndf_dwq_lt_plt <- ndf_dwq_lt_c %>%
mutate(plt = map2(df_data, Parameter, .f = plot_samp_effort_sta)) %>%
select(-df_data) %>%
nest(plts = -Parameter)